{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### INTERVAL CADD-SV input bed files\n",
    "\n",
    "* SV VCF variant file and accompanying variant ID list is one-based \n",
    "* Focus on duplications and deletions \n",
    "* May need to convert MEI -> INS but not sure \n",
    "* Remove any other SV types\n",
    "* Required to write ~5,000 SVs per file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import pandas as pd\n",
    "import glob\n",
    "from pathlib import Path \n",
    "import pysam\n",
    "\n",
    "# add location of classes to path\n",
    "sys.path.append(\"/lustre/scratch119/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/INTERVAL-seq_rare_variants\")\n",
    "from constants.constants import WORKING_DIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate SV bed files split by 5000 variants and associated config.yml file \n",
    "lines_per_file = 5000\n",
    "bed_out = None\n",
    "with open(sv_info_path) as f_in:\n",
    "    for lineno, line in enumerate(f_in):\n",
    "        if lineno % lines_per_file == 0:\n",
    "            if bed_out:\n",
    "                bed_out.close()\n",
    "            bed_out_path = f\"{WORKING_DIR}software/CADD-SV/input/id_intrvl_svs_{lineno+lines_per_file}.bed\"\n",
    "            bed_out = open(bed_out_path, \"w\")\n",
    "            # generate input config\n",
    "            config_out_path = f\"{WORKING_DIR}software/CADD-SV/input_configs/config_{lineno+lines_per_file}.yml\"\n",
    "            config_out = open(config_out_path, \"w\")\n",
    "            config_out.write(f\"---\\ndataset:\\n        - intrvl_svs_{lineno+lines_per_file}\\n...\\n\")\n",
    "            config_out.close()\n",
    "        if line.startswith(\"plinkID\"):\n",
    "            continue\n",
    "        else: \n",
    "            chrom, start, end = line.split(\"\\t\")[2], line.split(\"\\t\")[3], line.split(\"\\t\")[4]\n",
    "            sv_type = line.split(\"\\t\")[5]\n",
    "            # convert to zero-based coordinates\n",
    "            start = int(start) - 1  \n",
    "            bed_out.write(f\"{chrom}\\t{start}\\t{end}\\t{sv_type}\\n\")\n",
    "    if bed_out:\n",
    "        bed_out.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input_configs/config_115000.yml\n",
      "input_configs/config_110000.yml\n",
      "input_configs/config_20000.yml\n",
      "input_configs/config_30000.yml\n",
      "input_configs/config_80000.yml\n",
      "input_configs/config_65000.yml\n",
      "input_configs/config_90000.yml\n",
      "input_configs/config_75000.yml\n",
      "input_configs/config_95000.yml\n",
      "input_configs/config_50000.yml\n",
      "input_configs/config_45000.yml\n",
      "input_configs/config_105000.yml\n",
      "input_configs/config_100000.yml\n",
      "input_configs/config_40000.yml\n",
      "input_configs/config_60000.yml\n",
      "input_configs/config_15000.yml\n",
      "input_configs/config_5000.yml\n",
      "input_configs/config_10000.yml\n",
      "input_configs/config_55000.yml\n",
      "input_configs/config_120000.yml\n",
      "input_configs/config_35000.yml\n",
      "input_configs/config_125000.yml\n",
      "input_configs/config_70000.yml\n",
      "input_configs/config_25000.yml\n",
      "input_configs/config_85000.yml\n"
     ]
    }
   ],
   "source": [
    "# generate list of config files \n",
    "config_path_list = glob.glob(f\"{WORKING_DIR}software/CADD-SV/input_configs/config_*\")\n",
    "config_paths_out = f\"{WORKING_DIR}/INTERVAL-seq_rare_variants/derepression/scripts/cadd_sv/interval_svs/config_paths.txt\"\n",
    "with open(config_paths_out, \"w\") as out: \n",
    "    for path in config_path_list: \n",
    "        print(\"/\".join(path.split(\"/\")[-2:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_output_path = f\"/lustre/scratch119/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/software/CADD-SV/v1.1_710aa5ad42b7ede7702e133ac86e39dd.tsv\"\n",
    "test_output_df = pd.read_csv(test_output_path, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>## CADD-SV v1.1 (GRCh38 coordinates)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>#Chrom</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Type</th>\n",
       "      <th>Name</th>\n",
       "      <th>CADD-SV-score</th>\n",
       "      <th>Raw-Score_span</th>\n",
       "      <td>Raw-Score_flank</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">1</th>\n",
       "      <th>213941196</th>\n",
       "      <th>213942363</th>\n",
       "      <th>DEL</th>\n",
       "      <th>.</th>\n",
       "      <th>10.42</th>\n",
       "      <th>0.642010177102381</th>\n",
       "      <td>0.642010177102381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213942363</th>\n",
       "      <th>213943530</th>\n",
       "      <th>DEL</th>\n",
       "      <th>.</th>\n",
       "      <th>8.521</th>\n",
       "      <th>0.571019241242962</th>\n",
       "      <td>0.571019241242962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213943530</th>\n",
       "      <th>213944697</th>\n",
       "      <th>INS</th>\n",
       "      <th>.</th>\n",
       "      <th>8.852</th>\n",
       "      <th>0.400410057227931</th>\n",
       "      <td>0.400410057227931</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>158364697</th>\n",
       "      <th>158365864</th>\n",
       "      <th>DUP</th>\n",
       "      <th>.</th>\n",
       "      <th>3.578</th>\n",
       "      <th>0.0694860755356306</th>\n",
       "      <td>-0.055765486328298</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                      ## CADD-SV v1.1 (GRCh38 coordinates)\n",
       "#Chrom Start     End       Type Name CADD-SV-score Raw-Score_span                          Raw-Score_flank\n",
       "1      213941196 213942363 DEL  .    10.42         0.642010177102381                     0.642010177102381\n",
       "       213942363 213943530 DEL  .    8.521         0.571019241242962                     0.571019241242962\n",
       "       213943530 213944697 INS  .    8.852         0.400410057227931                     0.400410057227931\n",
       "2      158364697 158365864 DUP  .    3.578         0.0694860755356306                   -0.055765486328298"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_output_df "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
